In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
%matplotlib inline
sns.set('notebook')
sns.set_style('whitegrid')
In [5]:
# Load the Grimm log, indexed by its EpochTime column.
#
# `header=0` consumes the first row of the file as a header and replaces it
# with `names`.  That is what the previous `header=False` did on old pandas
# (False == 0); current pandas rejects a boolean `header` with a TypeError.
# NOTE(review): if the CSV really has no header row, `header=None` is the
# correct value -- confirm against the raw file.
GRIMM_COLUMNS = ['EpochTime', 'Count', 'PM1', 'PM10', 'PM2.5']
grimm = pd.read_csv('data/humexp/Grimm.csv', index_col='EpochTime', header=0,
                    names=GRIMM_COLUMNS)

# EpochTime is interpreted as seconds since the Unix epoch.  Let pandas do the
# unit conversion: multiplying by 1e9 in float64 and casting to `int` loses
# precision above 2**53 and overflows on platforms where `int` is 32-bit.
grimm.index = pd.to_datetime(grimm.index.values, unit='s')
In [6]:
# Preview the first rows of the Grimm frame to check the parsed columns and index.
grimm.head()
Out[6]:
In [7]:
# Pairwise scatter/histogram matrix over every column of the Grimm frame.
sns.pairplot(grimm)
Out[7]:
In [8]:
SPECK_COLUMNS = ['EpochTime', 'Humidity', 'Concentration', 'Count', 'Raw', 'Temp']


def load_speck(path, skip_rows):
    """Read one Speck CSV log into a DataFrame indexed by timestamp.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    skip_rows : int
        Number of leading data rows to discard after parsing.

    Returns
    -------
    pandas.DataFrame
        The ``SPECK_COLUMNS`` fields other than ``EpochTime``, indexed by a
        DatetimeIndex built from ``EpochTime`` (interpreted as seconds).
    """
    # `header=0` replaces the file's first row with `names`; this matches what
    # `header=False` did on old pandas, while current pandas raises a TypeError
    # for a boolean `header`.
    # NOTE(review): use `header=None` if the files have no header row -- confirm.
    frame = pd.read_csv(path, index_col='EpochTime', header=0, names=SPECK_COLUMNS)
    frame = frame.iloc[skip_rows:]
    # Convert via `unit='s'` rather than `(values * 1e9).astype(int)`, which
    # loses precision in float64 and overflows where `int` is 32-bit.
    frame.index = pd.to_datetime(frame.index.values, unit='s')
    return frame


# NOTE(review): the reason for dropping 2 rows from Speck1 and 1 from Speck2 is
# not recorded; presumably it lines up the start of the logs -- confirm.
speck1 = load_speck('data/humexp/Speck1.csv', skip_rows=2)
speck2 = load_speck('data/humexp/Speck2.csv', skip_rows=1)
In [9]:
# Preview the first rows of the Speck1 frame to check the parsed columns and index.
speck1.head()
Out[9]:
In [10]:
def to_minute_means(frame):
    """Average `frame` into 1-minute bins and drop bins containing any NaN.

    The aggregation is spelled out with `.mean()`: current pandas returns a
    lazy Resampler from `.resample()`, which has no `dropna()` method, so the
    old implicit-mean form `frame.resample('1Min').dropna()` no longer runs.
    """
    return frame.resample('1min').mean().dropna()


speck1 = to_minute_means(speck1)
speck2 = to_minute_means(speck2)
grimm = to_minute_means(grimm)

# Every later cell pairs rows of the three frames by position (joint plots,
# train/test split), so keep only the minutes present in all three.  This is a
# no-op when the indexes already coincide.
shared_minutes = speck1.index.intersection(speck2.index).intersection(grimm.index)
speck1 = speck1.loc[shared_minutes]
speck2 = speck2.loc[shared_minutes]
grimm = grimm.loc[shared_minutes]
In [11]:
# Pairwise comparison of the three concentration signals.
# The arrays are paired by position, so the three frames must have the same
# length and row order.
# x/y are passed by keyword because current seaborn no longer accepts them
# positionally; axis labels are set explicitly since `.values` strips the
# Series names.
concentration_signals = [
    ('Speck1 Concentration', speck1['Concentration'].values),
    ('Speck2 Concentration', speck2['Concentration'].values),
    ('Grimm PM2.5', grimm['PM2.5'].values),
]
signal_pairs = [(0, 1), (0, 2), (1, 2)]
for x_pos, y_pos in signal_pairs:
    x_label, x_values = concentration_signals[x_pos]
    y_label, y_values = concentration_signals[y_pos]
    joint_grid = sns.jointplot(x=x_values, y=y_values)
    joint_grid.set_axis_labels(x_label, y_label)
Out[11]:
In [12]:
# Side-by-side overview: the three concentration traces (left) and the two
# Speck humidity traces (right).  Each line is labelled so the legend tells
# the instruments apart.
fig, (ax_conc, ax_hum) = plt.subplots(1, 2)

ax_conc.plot(grimm['PM2.5'], label='Grimm PM2.5')
ax_conc.plot(speck1['Concentration'], alpha=0.8, label='Speck1 Concentration')
ax_conc.plot(speck2['Concentration'], alpha=0.8, label='Speck2 Concentration')
ax_conc.set_title('Concentration')
ax_conc.set_xlabel('Time')
ax_conc.legend()

ax_hum.plot(speck1['Humidity'], label='Speck1')
ax_hum.plot(speck2['Humidity'], label='Speck2')
ax_hum.set_title('Humidity')
ax_hum.set_xlabel('Time')
ax_hum.legend()
Out[12]:
In [13]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, LinearRegression
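Compare two predictors: an RBF-kernel SVM, which may overfit the training data, and a linear ridge regression, which will not be able to overfit if $d \ll n$ (where $d$ is the number of features and $n$ the number of training samples).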
In [14]:
# Candidate regressors, keyed by the label used for plots and score print-outs.
# Both pipelines standardise the inputs first; the ridge model additionally
# expands them into degree-2 polynomial features.
ridge_model = make_pipeline(StandardScaler(), PolynomialFeatures(2), Ridge())
# NOTE(review): `degree` is documented as used only by the 'poly' kernel, so it
# should have no effect with kernel='rbf' -- confirm before relying on it.
svm_model = make_pipeline(StandardScaler(),
                          SVR(kernel='rbf', C=1e4, epsilon=1, degree=3))
predictors = {'Ridge': ridge_model, 'RBF SVM': svm_model}
# Note: the RBF parameters were not tuned with a validation set, but with the
# test set.  This is more of an exploration and is not suitable for publication.
In [15]:
# Chronological hold-out split: the first N_TRAIN rows train the models and
# the remaining rows are held out for testing.  (An interleaved even/odd-row
# split was tried earlier and is no longer used.)
N_TRAIN = 500

# Features are all Speck1 columns; the target is the Grimm PM2.5 column.
# NOTE(review): rows are paired by position, which assumes speck1 and grimm
# share the same row-for-row timestamps -- confirm.
X = speck1.iloc[:N_TRAIN].values
y = grimm['PM2.5'].iloc[:N_TRAIN]
testX = speck1.iloc[N_TRAIN:].values
testy = grimm['PM2.5'].iloc[N_TRAIN:]

# Fit every candidate on the training rows and keep its held-out predictions,
# keyed by the same label as in `predictors`.
results = {}
for label, regressor in predictors.items():
    regressor.fit(X, y)
    results[label] = regressor.predict(testX)
In [16]:
# Held-out Grimm PM2.5 against each model's prediction.
# The predictions are bare arrays without an index, so they are plotted
# against `testy.index` explicitly; otherwise they would be drawn against
# 0..n-1 while `testy` may be drawn against its timestamps.
# Assumes each prediction has the same length as `testy`.
fig, ax = plt.subplots()
ax.plot(testy.index, testy.values, label='Grimm')
for label, predicted in results.items():
    ax.plot(testy.index, predicted, label=label, alpha=0.7)
ax.set_xlabel('Time')
ax.set_ylabel('PM2.5')
ax.legend()
Out[16]:
In [ ]:
# Report each fitted model's score on the data it was trained on (X, y).
# The previous version scored every second row (`[::2]`), a leftover from an
# abandoned interleaved split, which mixes training and held-out rows.
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Training data fit scores')
for label, regressor in predictors.items():
    print(label + ' ' + str(regressor.score(X, y)))
For each feature (a polynomial combination of the input features), what is the respective weight in the ridge regressor?
In [ ]:
# Pair each polynomial term with its ridge weight.
# Step 1 of the 'Ridge' pipeline is PolynomialFeatures and step 2 is Ridge
# (step 0 is the scaler).  Each row of `powers_` has one entry per input
# column, in the order printed by `speck1.columns`.
ridge_pipeline = predictors['Ridge']
poly_step = ridge_pipeline.steps[1][1]
ridge_step = ridge_pipeline.steps[2][1]

print(speck1.columns)
# list(...) so the pairs are printed on Python 3, where zip() is lazy.
print(list(zip(poly_step.powers_, ridge_step.coef_)))